This notebook will guide you through implementing a custom layer in neon, as well as a custom activation function. You will learn how a neon layer is structured (configure, allocate, fprop, and bprop), how to subclass ParameterLayer to build your own fully connected layer, and how to assemble and train a model that uses it.
The first step is to set up our compute backend and initialize our dataset.
In [ ]:
import neon
# use a GPU backend
from neon.backends import gen_backend
be = gen_backend('gpu', batch_size=128)
# load the MNIST dataset
from neon.data import MNIST
mnist = MNIST(path='data/')
train_set = mnist.train_iter
test_set = mnist.valid_iter
Instead of importing the neon-supplied Affine layer, we will build our own.
Note: Affine is actually a compound layer; it bundles a linear layer with a bias transform and an activation function. The Linear layer is what implements a fully connected layer.
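To make the note concrete, here is a rough illustration of that bundling. This is a sketch, not the library's literal expansion (Affine also handles optional batch normalization), so treat the equivalence as approximate.
In [ ]:
from neon.initializers import Gaussian
from neon.layers.layer import Affine, Linear, Bias, Activation
from neon.transforms.activation import Rectlin

init = Gaussian(loc=0.0, scale=0.01)

# one compound layer ...
fc = Affine(nout=100, init=init, bias=init, activation=Rectlin())

# ... behaves roughly like this stack of simple layers
fc_expanded = [Linear(nout=100, init=init),   # fully connected weights
               Bias(init=init),               # per-unit bias
               Activation(Rectlin())]         # ReLU nonlinearity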
First, let's build our own linear layer, called MyLinear; later we will see how it can be wrapped in a compound layer, MyAffine.
There are several important components to a layer in neon:
configure: during model initialization, this layer receives the previous layer's object and uses it to set this layer's in_shape and out_shape attributes.
allocate: after each layer's shape is configured, the shape information is used to allocate memory for the output activations produced by fprop.
fprop: forward propagation. Should return a tensor with shape equal to the layer's out_shape attribute.
bprop: backward propagation. Should return the deltas to pass back to the previous layer.
In the implementation below, fprop is written with element-wise backend operations, so it will be very slow. Try replacing it with the backend's compound_dot, as is already done in the bprop function (a sketch of that replacement follows the code cell).
In [ ]:
from neon.layers.layer import ParameterLayer, interpret_in_shape

# Subclass ParameterLayer, which handles the allocation
# of memory buffers for the output activations, weights, and
# bprop deltas.
class MyLinear(ParameterLayer):

    def __init__(self, nout, init, name=None):
        super(MyLinear, self).__init__(init, name, "Disabled")
        self.nout = nout

    def __str__(self):
        return "Linear Layer '%s': %d inputs, %d outputs" % (
            self.name, self.nin, self.nout)

    def configure(self, in_obj):
        super(MyLinear, self).configure(in_obj)
        # the shape of the input is (# input features, batch_size)
        (self.nin, self.nsteps) = interpret_in_shape(self.in_shape)
        # the shape of the output is (# output units, batch_size)
        self.out_shape = (self.nout, self.nsteps)
        # if the shape of the weights has not been allocated yet, set it:
        # this layer's W is a tensor of shape (# outputs, # inputs)
        if self.weight_shape is None:
            self.weight_shape = (self.nout, self.nin)
        return self

    def fprop(self, inputs, inference=False, beta=0.0):
        self.inputs = inputs
        # here we compute y = W*x inefficiently, one element at a time,
        # using the backend functions
        for r in range(self.outputs.shape[0]):
            for c in range(self.outputs.shape[1]):
                self.outputs[r, c] = self.be.sum(self.W[r, :] * inputs[:, c].T)

        # TODO:
        # try substituting the for loops above with the backend `compound_dot`
        # function to see the speed-up from using a custom gpu kernel!
        # self.be.compound_dot(A=self.W, B=inputs, C=self.outputs)
        return self.outputs

    def bprop(self, error, alpha=1.0, beta=0.0):
        # to save you a headache, we use the backend compound_dot function
        # here to compute the back-propagated deltas = W^T * error
        if self.deltas:
            self.be.compound_dot(A=self.W.T, B=error, C=self.deltas,
                                 alpha=alpha, beta=beta)
        self.be.compound_dot(A=error, B=self.inputs.T, C=self.dW)
        return self.deltas
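As the TODO in fprop suggests, the two nested for loops can be replaced by a single call to the backend's compound_dot, which launches one fused matrix-multiply kernel instead of many small element-wise operations. The subclass below is only a sketch to show the substitution (the name MyFastLinear is ours, not neon's); you could equally edit MyLinear.fprop in place.
In [ ]:
class MyFastLinear(MyLinear):
    # same layer as MyLinear above, but fprop uses one fused GEMM call
    def fprop(self, inputs, inference=False, beta=0.0):
        self.inputs = inputs
        # outputs = W . inputs in a single compound_dot call
        self.be.compound_dot(A=self.W, B=inputs, C=self.outputs, beta=beta)
        return self.outputs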
The MyLinear layer can also be wrapped in a compound container (the MyAffine promised earlier) that bundles it with a bias, batch normalization, and an activation, just as neon's Affine does.
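Here is a minimal sketch of such a wrapper, assuming neon's CompoundLayer base class and its add_postfilter_layers helper (the same machinery the built-in Affine uses); verify the import against your neon version before relying on it.
In [ ]:
from neon.layers.layer import CompoundLayer

class MyAffine(CompoundLayer):
    # bundle MyLinear with an optional bias, batch norm, and activation
    def __init__(self, nout, init, bias=None, batch_norm=False,
                 activation=None, name=None):
        super(MyAffine, self).__init__(bias=bias, batch_norm=batch_norm,
                                       activation=activation, name=name)
        self.append(MyLinear(nout, init=init, name=name))
        # attach the Bias/BatchNorm/Activation layers configured above
        self.add_postfilter_layers()
For clarity, the model assembled below keeps the pieces explicit and simply alternates MyLinear layers with Activation layers.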
In [ ]:
from neon.initializers import Gaussian
from neon.models import Model
from neon.layers.layer import Activation
from neon.transforms.activation import Rectlin, Softmax
init_norm = Gaussian(loc=0.0, scale=0.01)
# assemble all of the pieces
layers = []
layers.append(MyLinear(nout=100, init=init_norm, name="Linear100"))
layers.append(Activation(Rectlin()))
layers.append(MyLinear(nout=10, init=init_norm, name="Linear10"))
layers.append(Activation(Softmax()))
# initialize model object
mlp = Model(layers=layers)
In [ ]:
from neon.layers import GeneralizedCost
from neon.transforms import CrossEntropyMulti
from neon.optimizers import GradientDescentMomentum
from neon.callbacks.callbacks import Callbacks
cost = GeneralizedCost(costfunc=CrossEntropyMulti())
optimizer = GradientDescentMomentum(0.1, momentum_coef=0.9)
callbacks = Callbacks(mlp, eval_set=test_set)
mlp.fit(train_set, optimizer=optimizer, num_epochs=10, cost=cost,
        callbacks=callbacks)